/**********************************************************************/
/*
      Title: clean_podes.do
			Authors: Robbie Dulin, Clotaire Boyer
			Created: 7 Jan 2020
    	Description: Cleans 2018 and 2019 PODES and creates variables for inclusion in
        lasso pools.
*/
/**********************************************************************/

/*----------------------------------------------------*/
                  /* Section 0: Setup */
/*----------------------------------------------------*/

* Pin the interpreter version and start from an empty, non-paging session.
version 14.2
clear
set more off

/*----------------------------------------------------*/
               /* Section: PODES 2018 */
/*----------------------------------------------------*/

* Open a date-stamped text log; any log that is already open is closed first.
capture log close
local prefix: display %tdCYND td(`c(current_date)')
log using "$log/`prefix'_podes_clean", replace text


*** Load part 1 of the 2018 village file
use "$importable/podes2018_desa_part1_revisi.dta", clear

** Add the part 3 columns; assert(3) aborts unless every village matches
merge 1:1 R101 R102 R103 R104 using "$importable/podes2018_desa_part3_revisi.dta", assert(3) nogenerate

** Add the part 4 columns under the same all-must-match requirement
merge 1:1 R101 R102 R103 R104 using "$importable/podes2018_desa_part4_revisi.dta", assert(3) nogenerate

* Convert the four identifier variables to numeric in one call
quietly destring R101 R102 R103 R104, replace

* Number of households per village (sum of PLN, non-PLN, and non-electricity HHs).
* rowtotal() treats a missing component as 0, so num_hh_desa is never missing.
//summ R501A1 R501A2 R501B
egen num_hh_desa = rowtotal(R501A1 R501A2 R501B)
//summ num_hh_desa

* Print the grand total of households to the log.
* The total is read from summarize's r(sum), which is set explicitly here, so the
* display never depends on r() results left behind by an earlier command. It also
* avoids storing the total in a float variable, which cannot hold a count of this
* size exactly.
quietly summarize num_hh_desa, meanonly
display %15.0fc r(sum)
// 77,412,096 seems like too many households, is this families?

// Question 401 does not appear to be in the data
//tab  R403A
//tab  R403B

* Government bank

  * Presence: 1 when the reported count is above 0 and below 200.
  * A missing count fails the "< 200" test and therefore yields 0.

generate himbara_bank = (R1208AK2 < 200) & (R1208AK2 > 0)

  * Distance: 0 when a branch is in the village, otherwise the reported distance

generate dist_himbara_bank = cond(himbara_bank == 1, 0, R1208AK3)


  * Ease of reaching a government bank (codes 1-4 of R1208AK4).
  * The indicators are cumulative: each level is also switched on whenever the
  * previous (easier) level is on, and the easiest level is on whenever a branch
  * is present in the village.

//tab  R1208AK4
local easier himbara_bank
local code 0
foreach lvl in veasy easy diff vdiff {
  local ++code
  generate `lvl'_reach_gov = (R1208AK4 == `code') | (`easier' == 1)
  local easier `lvl'_reach_gov
}
generate miss_reach_gov = R1208AK4 == .


* Private bank

  * Presence: 1 when the reported count is positive

generate private_bank = R1208BK2 > 0 // No NAs fine to use ineq

  * Distance: 0 when a branch is in the village, otherwise the reported distance

generate dist_private_bank = cond(private_bank == 1, 0, R1208BK3)



  * Ease of reaching a private bank (codes 1-4 of R1208BK4).
  * Cumulative indicators: each level is also on whenever the previous (easier)
  * level is on; the easiest level is on whenever a branch is in the village.

//tab  R1208BK4
local easier private_bank
local code 0
foreach lvl in veasy easy diff vdiff {
  local ++code
  generate `lvl'_reach_priv = (R1208BK4 == `code') | (`easier' == 1)
  local easier `lvl'_reach_priv
}
generate miss_reach_priv = R1208BK4 == .

* ATM

  * Presence: 1 when R1209CK2 is coded 1

generate atm = R1209CK2 == 1

  * Distance: 0 when an ATM is in the village, otherwise the reported distance

generate distance_atm = cond(atm == 1, 0, R1209CK3)



  * Ease of reaching an ATM (codes 1-4 of R1209CK4).
  * Cumulative indicators: each level is also on whenever the previous (easier)
  * level is on; the easiest level is on whenever an ATM is in the village.

local easier atm
local code 0
foreach lvl in veasy easy diff vdiff {
  local ++code
  generate `lvl'_reach_atm = (R1209CK4 == `code') | (`easier' == 1)
  local easier `lvl'_reach_atm
}
generate miss_reach_atm = R1209CK4 == .


* Bank agent in village

  * Presence: 1 when R1209GK2 is coded 1

//tab  R1209GK2
generate agent = R1209GK2 == 1

  * Distance: 0 when an agent is in the village, otherwise the reported distance
// maybe the distances are top coded at 99? Should I just leave as is?

generate distance_agent = cond(agent == 1, 0, R1209GK3)



  * Ease of reaching a bank agent (codes 1-4 of R1209GK4).
  * Cumulative indicators: each level is also on whenever the previous (easier)
  * level is on; the easiest level is on whenever an agent is in the village.
//tab  R1209GK4
local easier agent
local code 0
foreach lvl in veasy easy diff vdiff {
  local ++code
  generate `lvl'_reach_agent = (R1209GK4 == `code') | (`easier' == 1)
  local easier `lvl'_reach_agent
}
generate miss_reach_agent = R1209GK4 == .


* Activity types g, f, e and j: number in village and distance (km).
* Source variable stems in the 2018 file:
local src_g R1206H    // type g is item h in 2018
local src_f R1206G    // type f is item g in 2018
local src_e R1206E
local src_j R1206F1

foreach t in g f e j {
  rename `src_`t''K2 type`t'_n
  rename `src_`t''K3 type`t'_km

  // Distance is 0 whenever at least one such activity is in the village.
  // The previous test was "== 1", which skipped villages with two or more,
  // left their distance as reported (or missing) and flagged them as missing below.
  // NOTE(review): this treats type`t'_n as a count, consistent with the
  // rowtotal() pooling at the end of this block - confirm against the codebook.
  replace type`t'_km = 0 if type`t'_n > 0 & type`t'_n < .

  // Missing-distance flag, then zero-fill, so that observations are not lost
  // when the distance is used as a control later in the analysis.
  generate type`t'_km_missing = type`t'_km == .
  replace type`t'_km = 0 if type`t'_km_missing == 1
}

* Pool e and g: food beverage shops and minimarkets together
egen typege_n = rowtotal(typeg_n typee_n)

* Agriculture indicators built from R403A and R403B.
* A missing R403A or R403B counts as "not equal to 1" in the comparisons below.

* rice-producing agriculture: both R403A and R403B coded 1
generate rice_agri = (R403A == 1) & (R403B == 1)
//summ rice_agri

* non-rice-producing agriculture: R403A coded 1, R403B anything else
generate non_rice_agri = (R403A == 1) & !(R403B == 1)
//summ non_rice_agri

* non-agricultural village: R403A anything other than 1
generate non_agri = !(R403A == 1)
//summ non_agri

* R403A missing
generate miss_agri = R403A == .
//summ *agri

** Transport type indicators, one per code of R403C1
//tab  R403C1

* codes in order: 1 asphalt road, 2 gravel road, 3 land, 4 water, 5 other
local code 0
foreach kind in asphalt_road gravel_road land water other {
  local ++code
  generate `kind'_transport = R403C1 == `code'
}

* missing transport type
generate miss_transport = R403C1 == .

//summ *transport

** Road passability indicators, one per code of R403C2
//tab  R403C2

* codes in order: 1 passable throughout the year, 2 passable usually (except at
* certain times), 3 passable in dry season, 4 impassable
local code 0
foreach when in always usually dry_season never {
  local ++code
  generate road_passable_`when' = R403C2 == `code'
}

* road passable missing
generate road_passable_missing = R403C2 == .
//summ road_passable*

* Distance to camat
// NOTE: There are many implausible values in this variable. There does not appear to be a clear cutoff,
//       so for now the variable is winsorized to its 1st and 99th percentiles.
//summ R1002AK5
_pctile R1002AK5, p(1 99)
local p01 `r(r1)'
local p99 `r(r2)'
generate distance_camat = R1002AK5
replace distance_camat = `p01' if distance_camat < `p01'
replace distance_camat = `p99' if distance_camat > `p99' & distance_camat != .
// zero-fill missing values; the indicator below records where that happened
replace distance_camat = 0 if distance_camat == .
//summ distance_camat

// missing indicator
generate distance_camat_miss = R1002AK5 == .
//tab  distance_camat_miss


* Total time to camat
// total time in minutes: R1002AJM is multiplied by 60 and added to R1002AME
generate time_to_camat_raw = (R1002AJM * 60) + R1002AME
//summ time_to_camat_raw
// winsorize to 1st and 99th percentile
_pctile time_to_camat_raw, p(1 99)
local p01 `r(r1)'
local p99 `r(r2)'
generate time_to_camat = time_to_camat_raw
replace time_to_camat = `p01' if time_to_camat < `p01'
replace time_to_camat = `p99' if time_to_camat > `p99' & time_to_camat != .
replace time_to_camat = 0 if time_to_camat == .

// missing indicator, taken from the raw total before it is discarded
generate time_to_camat_miss = time_to_camat_raw == .
//tab  time_to_camat_miss
drop time_to_camat_raw

* Fees to travel to camat
//summ R1002AK7
generate fees_camat = R1002AK7
_pctile fees_camat, p(1 99)
local p01 `r(r1)'
local p99 `r(r2)'
replace fees_camat = `p01' if fees_camat < `p01'
replace fees_camat = `p99' if fees_camat > `p99' & fees_camat != .
replace fees_camat = 0 if fees_camat == .

// missing indicator
generate fees_camat_miss = R1002AK7 == .
//tab  fees_camat_miss

* Cell phone signal quality: one indicator per code of R1005C
* (1 very strong, 2 strong, 3 weak, 4 none), plus a missing indicator
//tab  R1005C
local code 0
foreach strength in vstrong strong weak no {
  local ++code
  generate `strength'_cell_signal = R1005C == `code'
}
generate miss_cell_signal = R1005C == .
//summ *cell_signal

* Internet signal quality: one indicator per code of R1005D
* (1 -> 4G, 2 -> 3G, 3 -> 25G, 4 -> none), plus a missing indicator
//tab  R1005D
local code 0
foreach net in 4G 3G 25G none {
  local ++code
  generate internet_cell_`net' = R1005D == `code'
}
generate internet_cell_miss = R1005D == .
//summ internet*

* Heterogeneity variables: signal in one of the two top categories (code 1 or 2)
generate hi_phone_sig = inlist(R1005C, 1, 2)
generate hi_int_sig = inlist(R1005D, 1, 2)
//summ hi*

* land under rice cultivation is missing!


** heterogeneity: travel time to capital (minutes)
* each total is the *JM component times 60 plus the *ME component
generate time_camat        = (R1002AJM * 60) + R1002AME
generate time_bupati       = (R1002BJM * 60) + R1002BME
generate time_other_camat  = (R1002CJM * 60) + R1002CME
generate time_other_bupati = (R1002DJM * 60) + R1002DME
//summ time_camat time_other_camat time_bupati time_other_bupati

* diagnostics written to the log only
count if time_other_camat < time_camat
count if time_bupati > time_other_bupati

* min of travel times; min() ignores a missing argument when the other is present
foreach office in camat bupati {
  generate min_time_`office' = min(time_`office', time_other_`office')
}

* winsorize a copy of each minimum at its 0.5th and 99.5th percentiles
foreach office in camat bupati {
  generate min_time_`office'_wins = min_time_`office'
  _pctile min_time_`office', p(0.5 99.5)
  local p005 `r(r1)'
  local p995 `r(r2)'
  replace min_time_`office'_wins = `p005' if min_time_`office' < `p005'
  replace min_time_`office'_wins = `p995' if min_time_`office' > `p995' & min_time_`office' != .
  //summ min_time_`office'_wins min_time_`office'
}

* discard the intermediates, then let the winsorized copies take the plain names
drop time_camat time_other_camat time_bupati time_other_bupati min_time_camat min_time_bupati
rename *_wins *
//summ min*

// Village head education level (R1701AK5)
//tab  R1701AK5
// Missing values are replaced by 0 and flagged with a dummy, so the same
// zero-replacement-plus-dummy treatment can be applied when education of the
// head HH at baseline is added.
generate desa_head_edu18 = cond(R1701AK5 == ., 0, R1701AK5)
generate desa_head_edu18_missing = R1701AK5 == .





** heterogeneity: road conditions
* Both indicators are 1 for any value other than code 1, including missing.
//tab  R1001A R1001B1
generate nonasphalt = !(R1001B1 == 1)
//tab  nonasphalt

//tab  R1001B2
generate roadimpass = !(R1001B2 == 1)
//tab  roadimpass

// No bank access: the branch counts for bank types A, B and C are all 0
// and no bank agent is present in the village (R1209GK2 coded 2)
generate no_bankaccess = (R1208AK2 == 0) & (R1208BK2 == 0) & (R1208CK2 == 0) & (R1209GK2 == 2)
//tab  no_bankaccess



** save
  * Keep the four identifiers plus the constructed variables only
  keep  R101 R102 R103 R104 *gov* *agent* *atm* *himbara* *priv* ///
        typej* typeg* typef* typee* *agri *transport road_passable* ///
        roadimpass nonasphalt distance_camat* time_to_camat* fees_camat* ///
        *cell* min* num_hh_desa no_bankaccess *desa_head* hi* ///
        desa_head_edu18 desa_head_edu18_missing

  * Tag this set of variables with a _podes suffix
  foreach v of varlist *agri *transport road_passable* distance_camat* time_to_camat* fees_camat* *cell* distance_agent agent atm distance_atm *himbara* *private* desa_head_edu18 desa_head_edu18_missing no_bankaccess {
    rename `v' `v'_podes
  }

  * Log the number of variables, then require every *podes variable to be non-missing
  display c(k)
  foreach v of varlist *podes {
    assert `v' != .
  }

  * The activity-type variables carry a year suffix instead
  foreach v of varlist  typeg* typef* typee* typej* {
    rename `v' `v'_podes18
  }

  save "$cleaned/podes_2018.dta", replace
  //summ roadimpass nonasphalt



/*----------------------------------------------------*/
                  /* Section: PODES 2019 */
/*----------------------------------------------------*/

use "$importable/podes_2019_b1b13.dta", clear

** Small businesses recovered from the set of various activities

* Activity types g, f, e and j: number, distance and accessibility.
* In the 2019 file the item letter equals the type letter (R801<letter>K2/K3/K4);
* in 2018, type g was item h and type f was item g.
foreach item in G F E J {
  local t = lower("`item'")

  rename R801`item'K2 type`t'_n
  rename R801`item'K3 type`t'_km

  // One indicator per accessibility code 1-4 of R801<letter>K4; each indicator
  // is left missing where the accessibility code itself is missing.
  local code 0
  foreach lvl in veasy easy hard vhard {
    local ++code
    generate type`t'_`lvl' = R801`item'K4 == `code' if R801`item'K4 != .
  }
}

* Pooling activities of types g and e
egen typege_n = rowtotal(typeg_n typee_n)

* No bank access: R802A, R802B and R802C are all 0
generate no_bankaccess = (R802A == 0) & (R802B == 0) & (R802C == 0)

* Complement of the above, stored directly under its year-suffixed name
generate bankaccess_podes19 = 1 - no_bankaccess

*** new village head
** merge in 2018 (baseline) versions of variables
  * make the identifiers numeric so they match the keys in the cleaned 2018 file
  quietly destring R10?, replace

  * Bring in the *_podes18 variables. keep(master match) discards villages found
  * only in the 2018 file; the merge indicator is stored as merge18.
  // NOTE: going forward, will need to create crosswalk between PODES 2018 and 2019
  merge 1:1 R101 R102 R103 R104 using "$cleaned/podes_2018.dta", keepusing(*_podes18) keep(master match) generate(merge18)

** save
  * merge18 is not in this list, so it does not reach the saved file
  keep R101 R102 R103 R104  typeg* typef* typee* typej* bankaccess_podes19
  //summ


  save "$cleaned/podes_2019.dta", replace

capture log close
